# Core numerics / dataframes plus the plotting stacks used below.
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Silence library warnings to keep notebook output readable.
warnings.filterwarnings("ignore")

# Load the dataset from a local CSV file (hard-coded absolute path).
data = pd.read_csv(r"C:\Users\laxma\Downloads\Fake News Detection Dataset.csv")
data.shape
(4500, 6)
# Preview the first five rows to sanity-check column parsing.
data.head(n=5)
| | ID | Word_Count | Number_of_Sentence | Unique_Words | Average_Word_Length | Label |
|---|---|---|---|---|---|---|
| 0 | 1606 | 10 | 4 | 24 | 6.176750 | 1 |
| 1 | 3718 | 10 | 8 | 25 | 5.826770 | 1 |
| 2 | 2634 | 10 | 7 | 18 | 4.619040 | 1 |
| 3 | 5560 | 10 | 6 | 18 | 4.961424 | 1 |
| 4 | 7494 | 10 | 4 | 21 | 4.114324 | 1 |
# Preview the last five rows.
data.tail(n=5)
| | ID | Word_Count | Number_of_Sentence | Unique_Words | Average_Word_Length | Label |
|---|---|---|---|---|---|---|
| 4495 | 1179 | 41 | 7 | 12 | 6.963924 | 0 |
| 4496 | 9445 | 100 | 5 | 15 | 3.136755 | 1 |
| 4497 | 4149 | 100 | 8 | 18 | 3.376823 | 1 |
| 4498 | 9877 | 85 | 14 | 42 | 5.331393 | 0 |
| 4499 | 6709 | 57 | 6 | 7 | 4.312751 | 0 |
# Per-column missing-value counts (isna is the canonical alias of isnull).
data.isna().sum()
ID 0 Word_Count 0 Number_of_Sentence 0 Unique_Words 0 Average_Word_Length 0 Label 0 dtype: int64
# Count fully duplicated rows (keep="first" is the default, spelled out).
data.duplicated(keep="first").sum()
0
# Summary statistics for the numeric columns (default quartiles, explicit).
data.describe(percentiles=[0.25, 0.5, 0.75])
| | ID | Word_Count | Number_of_Sentence | Unique_Words | Average_Word_Length | Label |
|---|---|---|---|---|---|---|
| count | 4500.000000 | 4500.000000 | 4500.000000 | 4500.000000 | 4500.000000 | 4500.000000 |
| mean | 5469.140000 | 53.934000 | 8.934667 | 24.943333 | 4.968105 | 0.329556 |
| std | 2599.193059 | 24.872743 | 3.407847 | 11.540708 | 1.152394 | 0.470104 |
| min | 1002.000000 | 10.000000 | 4.000000 | 5.000000 | 3.000385 | 0.000000 |
| 25% | 3228.750000 | 35.000000 | 6.000000 | 17.000000 | 3.980553 | 0.000000 |
| 50% | 5449.500000 | 52.000000 | 9.000000 | 22.000000 | 4.906200 | 0.000000 |
| 75% | 7706.750000 | 75.000000 | 12.000000 | 33.000000 | 5.996111 | 1.000000 |
| max | 9999.000000 | 100.000000 | 15.000000 | 50.000000 | 6.999799 | 1.000000 |
# Structural summary: dtypes, non-null counts and memory footprint.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4500 entries, 0 to 4499 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 4500 non-null int64 1 Word_Count 4500 non-null int64 2 Number_of_Sentence 4500 non-null int64 3 Unique_Words 4500 non-null int64 4 Average_Word_Length 4500 non-null float64 5 Label 4500 non-null int64 dtypes: float64(1), int64(5) memory usage: 211.1 KB
# Column labels, for reference in the plots and model below.
data.columns
Index(['ID', 'Word_Count', 'Number_of_Sentence', 'Unique_Words',
'Average_Word_Length', 'Label'],
dtype='object')
# VISUALIZATION -------------------------------------------------------------

# Scatter of record ID vs. sentence count. ID is an arbitrary row key, so
# this mainly shows the spread of Number_of_Sentence across records.
plt.scatter(data['ID'], data['Number_of_Sentence'], color='yellowgreen')
plt.xticks(rotation=90)
plt.show()

# Interactive violin plot of unique-word counts, colored by word count.
fig = px.violin(data, x='Word_Count', y='Unique_Words', color='Word_Count')
fig.show()

# Bar chart of ID by label (presumably 0 = real, 1 = fake -- confirm with the
# dataset documentation).
plt.bar(data['Label'], data['ID'])
plt.xticks(rotation=90)
plt.show()

# Frequency of each Word_Count value.
plt.figure(figsize=(10, 4))
sns.countplot(x='Word_Count', data=data, color='b')
plt.xticks(rotation=90)
plt.show()

# Class balance of the target label.
# FIX: removed an unused `top_car` variable (dead code left over from a
# copy-pasted cell; it was computed but never referenced).
plt.figure(figsize=(10, 4))
sns.countplot(y=data.Label, color='red')
<AxesSubplot:xlabel='count', ylabel='Label'>
# Word count against the target label.
plt.figure(figsize=(8, 4))
sns.scatterplot(data=data, x='Label', y='Word_Count')
plt.xlabel('Label')
plt.ylabel('Word_Count')
plt.show()

# Average word length per unique-word count.
# FIX: seaborn >= 0.12 rejects positional x/y arguments, so pass them as
# keywords (same plot, forward-compatible call).
sns.barplot(x=data['Unique_Words'], y=data['Average_Word_Length'], color='cyan')
plt.xticks(rotation=90)
plt.show()

# Distribution of the target label.
sns.displot(data["Label"])
<seaborn.axisgrid.FacetGrid at 0x17cc89784c0>
# Label trend over sentence count.
sns.lineplot(data=data, x='Number_of_Sentence', y='Label')
<AxesSubplot:xlabel='Number_of_Sentence', ylabel='Label'>
# Frequency of each Unique_Words value.
sns.countplot(data=data, x='Unique_Words')
<AxesSubplot:xlabel='Unique_Words', ylabel='count'>
# MODEL BUILDING ------------------------------------------------------------
# FIX: the original predicted Word_Count (y = data["Word_Count"]), turning the
# task into ~90-class classification of a count variable -- hence the reported
# train/test "accuracy" of 0.086/0.053.  For fake-news detection the target
# is the Label column.
y = data["Label"].values

# Features: drop the target and the ID column -- ID is an arbitrary row
# identifier with no predictive signal.
x_data = data.drop(["Label", "ID"], axis=1)

# Min-max scale every feature into [0, 1] so the RBF kernel treats them on a
# comparable scale.
# NOTE(review): scaling statistics are computed on the full dataset before the
# split, which leaks test information -- acceptable for a quick baseline, but
# a proper pipeline would fit the scaler on the training fold only.
x = (x_data - np.min(x_data)) / (np.max(x_data) - np.min(x_data))

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

from sklearn.svm import SVC

# Default RBF-kernel support vector classifier; random_state only affects
# probability estimation / data shuffling, kept for reproducibility.
svm = SVC(random_state=1)
svm.fit(x_train, y_train)
print("train accuracy:", svm.score(x_train, y_train))
print("test accuracy:", svm.score(x_test, y_test))
train accuracy: 0.08583333333333333 test accuracy: 0.05333333333333334